In [37]:
#Importing the different libraries to analyze the data

import plotly.offline as pyo
import plotly.graph_objs as go
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm
import statistics
# Render matplotlib/seaborn figures inline in the notebook output.
%matplotlib inline
# Initialise plotly's offline notebook mode so plotly figures can be displayed in the notebook.
pyo.init_notebook_mode()
In [2]:
# Load the Iris measurements from the CSV file (path is relative to the notebook's working directory).

df = pd.read_csv(filepath_or_buffer='Iris.csv')
In [19]:
# Displaying the dataset (a bare `df` as the last expression renders the DataFrame).
# NOTE(review): the recorded output has no 'Id' column because this cell was executed after the
# drop cell below; in a top-to-bottom run the frame shown here would presumably still include 'Id'.

df
Out[19]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns

In [4]:
# Drop the 'Id' column: it is only a row identifier and does not contribute to the analysis.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has already been
# removed is a no-op instead of raising KeyError.

df = df.drop(columns=['Id'], errors='ignore')
In [5]:
# Getting an understanding of the data: count, mean, std, min/max and quartiles
# (the 50% row is the median) for every numeric column, all species combined.
# A bare `df.groupby(['Species'])` used to precede this line; its result was discarded,
# so it had no effect on the output and has been removed.

df.describe()
Out[5]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
In [10]:
# Median of each numeric column. numeric_only=True explicitly skips the string 'Species'
# column instead of relying on pandas silently dropping it (deprecated; raises TypeError in newer versions).
df.median(numeric_only=True)
/tmp/ipykernel_124/530051474.py:1: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

Out[10]:
SepalLengthCm    5.80
SepalWidthCm     3.00
PetalLengthCm    4.35
PetalWidthCm     1.30
dtype: float64
In [ ]:
# For this exploratory data analysis, I will be doing random analysis without any end goal (i.e. without specific questions to answer).
# The point of EDA is usually to discover patterns within the dataset, but since the Iris dataset has been around
# for a while, I thought that this would be a great opportunity to simply showcase my EDA skills.
In [30]:
# Number of observations recorded for each species.
# The column is passed by keyword (data=/x=): passing it positionally is deprecated and,
# from seaborn 0.12 on, a lone positional argument is interpreted as `data` rather than `x`.

plt.title('Species Count')
sns.countplot(data=df, x='Species')
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[30]:
<AxesSubplot:title={'center':'Species Count'}, xlabel='Species', ylabel='count'>
In [18]:
# KDE plot for the dataset. A kernel density estimate is a smoothed view of how the observations
# of each variable are distributed.
# Judging by eye from the plot: Sepal Length and Sepal Width look roughly bell-shaped, while
# Petal Length and Petal Width do not look normally distributed.
# NOTE(review): this is a visual impression only; no normality test is run in this notebook.


sns.kdeplot(data=df)
Out[18]:
<AxesSubplot:ylabel='Density'>
In [42]:
# Sepal width (y) against sepal length (x), one colour per Iris species.

fig = px.scatter(
    data_frame=df,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color="Species",
)
fig.show()
In [7]:
# Per-species OLS trendline of sepal width (y) on sepal length (x).
# The figures below are presumably the R² values shown in the trendline hover; R² is the share of
# the variance in sepal width that the linear fit on sepal length explains (not the share of points that "fit").
# Iris-Setosa:     R² = 0.557681 -> about 55.8% of the variance explained.
# Iris-Versicolor: R² = 0.26582  -> about 26.6% of the variance explained.
# Iris-Virginica:  R² = 0.209057 -> about 20.9% of the variance explained.

fig = px.scatter(
    data_frame=df,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color="Species",
    opacity=0.65,
    trendline='ols',
)
fig.show()
/opt/conda/lib/python3.8/site-packages/statsmodels/compat/pandas.py:65: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

In [8]:
# Petal length (y) against petal width (x), one colour per Iris species.

fig = px.scatter(
    data_frame=df,
    x="PetalWidthCm",
    y="PetalLengthCm",
    color="Species",
)
fig.show()
In [9]:
# Per-species OLS trendline of petal width (y) on petal length (x).
# The figures below are presumably the R² values shown in the trendline hover; R² is the share of
# the variance in petal width that the linear fit on petal length explains (not the share of points that "fit").
# Iris-Setosa:     R² = 0.093825 -> about 9.4% of the variance explained.
# Iris-Versicolor: R² = 0.618847 -> about 61.9% of the variance explained.
# Iris-Virginica:  R² = 0.103754 -> about 10.4% of the variance explained.

fig = px.scatter(
    data_frame=df,
    x='PetalLengthCm',
    y='PetalWidthCm',
    color="Species",
    opacity=0.65,
    trendline='ols',
)
fig.show()
In [33]:
# Determining correlation

# Heatmap of the pairwise correlations between the numeric measurements, annotated with the
# coefficient in each cell, to judge which relationships are strong or weak.
# Taking a quick look, there seems to be strong positive correlation between:
# Petal Length and Petal Width; Sepal Length and Petal Width; Sepal Length and Petal Length
# There is a negative correlation between:
# Sepal Width and Petal Length;
# Sepal Width and Petal Width
# NOTE(review): confirm from the annotated values whether these negative correlations are
# actually strong; they may be only moderate.

# Only numeric columns are correlated: the string 'Species' column is excluded explicitly
# rather than relying on corr() silently dropping it (newer pandas raises on it instead).
q = df.select_dtypes(include='number').corr()
sns.heatmap(q, annot = True)
Out[33]:
<AxesSubplot:>
In [8]:
# Pairwise scatter plots of every numeric variable, coloured by species,
# with a KDE of each variable on the diagonal.
sns.pairplot(
    data=df,
    hue="Species",
    palette="viridis",
    diag_kind="kde",
    height=3.5,
)
Out[8]:
<seaborn.axisgrid.PairGrid at 0x7fc0d13985e0>
In [9]:
# Horizontal boxplot of each variable, all species combined. The green triangle marks the mean.
# Sepal Length: mean slightly above the median -> slightly positively skewed.
# Sepal Width:  mean slightly above the median -> slightly positively skewed.
# Petal Length: mean below the median -> negatively skewed.
# Petal Width:  mean slightly below the median -> slightly negatively skewed.

sns.catplot(
    data=df,
    kind="box",
    orient="h",
    showmeans=True,
)
Out[9]:
<seaborn.axisgrid.FacetGrid at 0x7fc0d0886af0>
In [11]:
# We can further dissect this by looking at the skewness of each numeric column.
# Sepal Length: positively skewed
# Sepal Width:  positively skewed
# Petal Length: negatively skewed
# Petal Width:  negatively skewed

# numeric_only=True explicitly skips the string 'Species' column instead of relying on
# pandas silently dropping it (deprecated; raises TypeError in newer versions).
df.skew(numeric_only=True)
/tmp/ipykernel_124/1665899112.py:1: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

Out[11]:
SepalLengthCm    0.314911
SepalWidthCm     0.334053
PetalLengthCm   -0.274464
PetalWidthCm    -0.104997
dtype: float64
In [12]:
# We can further dissect this by looking at the kurtosis of each numeric column.
# pandas reports excess (Fisher) kurtosis, so a normal distribution scores 0.

# numeric_only=True explicitly skips the string 'Species' column instead of relying on
# pandas silently dropping it (deprecated; raises TypeError in newer versions).
df.kurt(numeric_only=True)
/tmp/ipykernel_124/1257127604.py:1: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

Out[12]:
SepalLengthCm   -0.552064
SepalWidthCm     0.290781
PetalLengthCm   -1.401921
PetalWidthCm    -1.339754
dtype: float64
In [ ]:
# A violinplot is used to visualize the distribution of numerical data. 
In [13]:
# Distribution of sepal length per species: violin with an inner box plot and every point drawn.
fig = px.violin(
    data_frame=df,
    x="Species",
    y="SepalLengthCm",
    color="Species",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()
In [24]:
# Distribution of sepal width per species: violin with an inner box plot and every point drawn.
fig = px.violin(
    data_frame=df,
    x="Species",
    y="SepalWidthCm",
    color="Species",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()
In [25]:
# Distribution of petal length per species: violin with an inner box plot and every point drawn.
fig = px.violin(
    data_frame=df,
    x="Species",
    y="PetalLengthCm",
    color="Species",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()
In [26]:
# Distribution of petal width per species: violin with an inner box plot and every point drawn.
fig = px.violin(
    data_frame=df,
    x="Species",
    y="PetalWidthCm",
    color="Species",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()
In [34]:
# Sepal length (y) against petal width (x), coloured by species; marker size also encodes petal width.
fig = px.scatter(
    data_frame=df,
    x="PetalWidthCm",
    y="SepalLengthCm",
    color="Species",
    size="PetalWidthCm",
)

# Horizontal legend anchored by its bottom-right corner just above the plot area.
legend_layout = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(legend=legend_layout)

fig.show()